Load the data

Starting with the Schaefer 400-parcel, 7 network atlas.

#This should be easy to generalize
#option to set directory and search path
#option to read in label file

library(data.table)
library(tidyr)

# Number of data.table threads: use SLURM's allocation when running on the
# cluster, otherwise fall back to 1. Sys.getenv() returns "" when the
# variable is unset; suppressWarnings() silences the resulting
# "NAs introduced by coercion" warning from as.numeric("").
ncores <- suppressWarnings(as.numeric(Sys.getenv('SLURM_CPUS_ON_NODE')))
if(is.na(ncores)){
  ncores <- 1
  message('No environment variable specifying number of cores. Setting to ', ncores, '.')
}
setDTthreads(ncores)

fname <- 'sea_rsfc_schaefer400x7.RDS'
if(file.exists(fname)){
  # Use the cached, labeled data if it has already been assembled.
  adt_labels <- readRDS(fname)
} else {
  data_dir <- '/net/holynfs01/srv/export/mclaughlin/share_root/stressdevlab/SEA_BIDS/derivatives/1.4.1-final/xcpengine-default'
  # Participants sub-1001..sub-1031 (skipping sub-1017), sessions ses-01..ses-10.
  subs <- sprintf('sub-10%02d', c(1:16,18:31))
  sess <- sprintf('ses-%02d', 1:10)
  subses <- expand.grid(sess, subs)
  files <- paste0(data_dir, '/', subses[,2], '/', subses[,1], '/fcon/schaefer400x7/', subses[,2], '_', subses[,1], '_schaefer400x7.net')
  file_id <- paste0(subses[,2], '_', subses[,1])
  
  # Read one .net edge-list file (first two header lines skipped); return an
  # empty data.table for sessions that were not collected or preprocessed.
  read_net <- function(f){
    if(file.exists(f)){
      data.table::fread(f, skip = 2, col.names = c('node1', 'node2', 'r'))
    } else {
      data.table::data.table()
    }
  }
  
  message('Reading rsfc files...')
  if(ncores > 1){
    message('Using ', ncores, ' cores to read files.')
    cl <- parallel::makePSOCKcluster(max(ncores - 1, 1))
    # Split into *contiguous* chunks. split(files, 1:ncores) would interleave
    # elements, so unlist() would return the tables in a different order than
    # `files`, and the `file_id` names assigned below would be misaligned.
    chunk <- cut(seq_along(files), breaks = ncores, labels = FALSE)
    # read_net is passed as an argument so it is shipped to the workers along
    # with the anonymous function (free variables in the global environment
    # are not exported automatically).
    adt_list <- unlist(parallel::parLapply(cl = cl, split(files, chunk),
                                           function(part, reader){
      lapply(part, reader)
    }, reader = read_net), recursive = FALSE)
    # stopCluster (not stop) shuts the workers down cleanly; stop(cl) would
    # raise an error instead of releasing them.
    try(parallel::stopCluster(cl))
  } else {
    adt_list <- lapply(files, read_net)
  }
  names(adt_list) <- file_id
  
  message('Combining data, assigning labels...')
  # Long format: one row per parcel pair per participant-session.
  adt <- rbindlist(adt_list, idcol = 'id')
  
  #https://github.com/ThomasYeoLab/CBIG/tree/master/stable_projects/brain_parcellation/Schaefer2018_LocalGlobal/Parcellations/MNI
  schaefer_400_7_net_ids <- fread('Schaefer2018_400Parcels_7Networks_order.txt', 
                                  col.names = c('node1', 'network1', 'V1', 'V2', 'V3', 'V4'))
  schaefer_400_7_net_ids[, c('V1', 'V2', 'V3', 'V4') := NULL]
  # Parse labels like "7Networks_LH_Vis_1" into network ('LH_Vis') and ROI id.
  schaefer_400_7_net_ids <- schaefer_400_7_net_ids %>% 
    tidyr::extract(network1, c('network1', 'roi1'), '7Networks_([RL]H_.*)_(\\d+)')
  
  # Keyed join: attach network labels to the node1 endpoint, then rename the
  # lookup columns and repeat for the node2 endpoint of each edge.
  setkey(adt, node1)
  setkey(schaefer_400_7_net_ids, node1)
  adt_labels1 <- schaefer_400_7_net_ids[adt]
  setnames(schaefer_400_7_net_ids, c('node2', 'network2', 'roi2'))
  setkey(adt_labels1, node2)
  setkey(schaefer_400_7_net_ids, node2)
  adt_labels <- schaefer_400_7_net_ids[adt_labels1]
  # Split "sub-XXXX_ses-YY" into participant id and session columns.
  adt_labels[, c('id', 'sess') := tstrsplit(id, '_', fixed = TRUE)]
  message('Saving RDS file to: ', fname)
  saveRDS(adt_labels, fname)
}

Retain just within-network connectivity. Also, Fisher-z transform the correlations for analyses. One small detail that is important here is that we keep network connectivity for same-hemisphere parcels only.

# Work on a copy so the cached table read from disk is left untouched by the
# by-reference (:=) operations below.
sea_schaefer400_7_withinnet <- copy(adt_labels)

# Decompose network labels like 'LH_Vis' or 'RH_DorsAttn_Post' into
# sub-network suffix, network name, and hemisphere.
sub_regex <- '[LR]H_[A-Za-z]+_*(.*)'
net_regex <- '[LR]H_([A-Za-z]+)_*(?:.*)'
hem_regex <- '([LR]H)_[A-Za-z]+_*(?:.*)'
# NOTE(review): distinct() is from dplyr, which is not attached in this chunk —
# presumably loaded in an earlier setup chunk; confirm.
sea_schaefer400_7_withinnet_nets <- distinct(sea_schaefer400_7_withinnet, network1)
sea_schaefer400_7_withinnet_nets[, sub1 := gsub(sub_regex, '\\1', network1)]
sea_schaefer400_7_withinnet_nets[, net1 := gsub(net_regex, '\\1', network1)]
sea_schaefer400_7_withinnet_nets[, hem1 := gsub(hem_regex, '\\1', network1)]
# Keyed join: attach the parsed sub/net/hem columns for the node1 endpoint...
setkey(sea_schaefer400_7_withinnet, network1)
setkey(sea_schaefer400_7_withinnet_nets, network1)
sea_schaefer400_7_withinnet <- sea_schaefer400_7_withinnet_nets[sea_schaefer400_7_withinnet]

# ...then rename the lookup's columns in place and repeat for node2.
setnames(sea_schaefer400_7_withinnet_nets, c('network2', 'sub2', 'net2', 'hem2'))
setkey(sea_schaefer400_7_withinnet, network2)
setkey(sea_schaefer400_7_withinnet_nets, network2)
sea_schaefer400_7_withinnet <- sea_schaefer400_7_withinnet_nets[sea_schaefer400_7_withinnet]

# Keep only same-network, same-hemisphere parcel pairs.
sea_schaefer400_7_withinnet <- sea_schaefer400_7_withinnet[net1 == net2 & hem1 == hem2]
sea_schaefer400_7_withinnet[, c('network1', 'network2') := NULL]
# Fisher z-transform of the correlations for analysis.
sea_schaefer400_7_withinnet[, z := atanh(r)]
# Hemisphere contrast code used as a model covariate: RH = -1, LH = 1.
sea_schaefer400_7_withinnet[, H := dplyr::case_when(hem1 == 'RH' ~ -1,
                                                    hem1 == 'LH' ~ 1)]
# Sanity check: 7 networks x 2 hemispheres = 14 within-network combinations.
if(!dim(distinct(sea_schaefer400_7_withinnet, net1, net2, hem1, hem2))[[1]] == 14){
  stop('Incorrect number of networks')
}

Some QC

Density of functional connectivity (pearson correlation) for each participant, for each session, overlayed on the density for all sessions and participants.

#Generate group-level density
# Standardize z across all observations so the group curve is on the same
# (standardized) scale as the per-session curves; x_on_z maps the density
# support back onto the z scale.
agg_mean <- mean(sea_schaefer400_7_withinnet$z)
agg_sd <- sd(sea_schaefer400_7_withinnet$z)
scalez <- (sea_schaefer400_7_withinnet$z - agg_mean) / agg_sd
density_agg <- density(scalez)
sea_schaefer400_densplot_agg <- data.table(x = density_agg$x,
                                           x_on_z = density_agg$x * agg_sd + agg_mean,
                                           y = density_agg$y)
#Generate per-participant-session densities
sea_schaefer400_MSD <- sea_schaefer400_7_withinnet[, list(mean = mean(z),
                                                          sd = sd(z)), 
                                                   by = c('id', 'sess')]
sea_schaefer400_densplot <- sea_schaefer400_MSD[sea_schaefer400_7_withinnet, 
                                                on = c('id', 'sess')]
sea_schaefer400_densplot[, scalez := (z - mean)/sd]
# Estimate each participant-session density once (the previous version called
# density(scalez) twice per group, once for $x and once for $y), then map the
# standardized support back to the z scale with that group's mean and sd.
sea_schaefer400_densplot <- 
  sea_schaefer400_MSD[sea_schaefer400_densplot[, {
    dens <- density(scalez)
    list(x = dens$x, y = dens$y)
  },
  by = c('id', 'sess')],
  on = c('id', 'sess')][, x_on_z := sd*x + mean]
u_ids <- unique(sea_schaefer400_densplot$id)
# One markdown section per participant: their per-session densities (mapped
# back to the correlation scale with tanh) overlaid on the group density.
# NOTE(review): ggplot2, apal, and jftheme are not defined in this chunk —
# presumably loaded/defined in an earlier setup chunk; confirm.
for(an_id in u_ids){
  cat(paste0('\n\n## ', an_id, '\n\n'))
  
  hplot <- ggplot(sea_schaefer400_densplot_agg, aes(x = tanh(x_on_z), y = y)) + 
    geom_ribbon(ymin = 0, aes(ymax = y, fill = 'Group', color  = 'Group'), 
                alpha = .5) + 
    geom_ribbon(data = sea_schaefer400_densplot[id == an_id, ], 
                ymin = 0, aes(ymax = y, fill = 'ID', color  = 'ID'), 
                alpha = .5) + 
    scale_fill_manual(aesthetics = c('color','fill'), breaks = c('Group', 'ID'), labels = c('Group', 'Participant'), values = apal[c(2,4)], name = 'Data') +
    facet_wrap(~sess, ncol = 2) + 
    labs(x = 'correlation', y = 'density') + 
    coord_cartesian(xlim = c(-1, 1), ylim = c(0, .5)) + 
    jftheme
  print(hplot)
}

sub-1001

sub-1002

sub-1003

sub-1004

sub-1005

sub-1006

sub-1007

sub-1008

sub-1009

sub-1010

sub-1011

sub-1012

sub-1013

sub-1014

sub-1015

sub-1016

sub-1018

sub-1019

sub-1020

sub-1021

sub-1022

sub-1023

sub-1024

sub-1025

sub-1026

sub-1027

sub-1028

sub-1029

sub-1030

sub-1031

Modeling

We are interested in stability and variability for each network. For each participant, we have multiple sessions, and within each session, we have multiple parcel-parcel pairs that provide information about the within-network connectivity. We can use the fact that we have multiple indicators of within-network connectivity to estimate the between-person variability, as well as the within-person (session-to-session) variability as distinct from measurement error (we assume that each parcel-parcel functional connectivity estimate in a network is an estimate of that network’s connectivity)*.

* This assumption means that we essentially treat differences in the FC between one pair and another as measurement error.

We can compute an ICC that describes both within-person and between-person variability by using a 3 level model. First, I subset the data for a single network. I then build an intercept-only model, allowing the intercept to vary by participant-ID, and by session within each ID. I want to make sure that the intercept is not just the mean across all parcel-pairs, even though the pairs are within-hemisphere. This is because what is typically done (according to Nessa) is that the FC is computed within hemisphere and then averaged across the two hemispheres. To accomplish this we can just include a dummy code for left versus right hemisphere, where RH = -1 and LH = 1. We should let this vary per-person and per-person-session to ensure we accomplish the same thing as if we had computed this separately for each person-session.

\[ \begin{align} z &= \beta_{0jk} + \beta_{1jk}\text{H}_{ijk} + \epsilon_{ijk} \\ \beta_{0jk} &= \pi_{00k} + \nu_{0jk} \\ \beta_{1jk} &= \pi_{10k} + \nu_{1jk} \\ \pi_{00k} &= \gamma_{000} + \upsilon_{00k} \\ \pi_{10k} &= \gamma_{100} + \upsilon_{10k} \end{align} \]

\[ \begin{align} z = &\gamma_{000} + \upsilon_{00k} + \nu_{0jk} + \\ &(\gamma_{100} + \upsilon_{10k} + \nu_{1jk})\text{H}_{ijk} + \\ &\epsilon_{ijk} \end{align} \] Where \(z\) is the measure of functional connectivity, and \(i\) indexes observations within each session, \(j\), for each participant \(k\). This means that we are able to estimate variance in per-person deviations (\(\upsilon_{00k}\)) from the network mean-connectivity (\(\gamma_{000}\)), per-session-deviations (\(\nu_{0jk}\)) from those per-person deviations, and the residual not explained by variability over persons or person-sessions (\(\epsilon_{ijk}\)). All these are adjusted by the effect of hemisphere at every level.

Testing these out

I want to make sure that these models are correct, so I’ll compare the variance portions, and total variance, between the two models.

library(lme4)
# Pick one network (the second) on which to compare model parameterizations.
networks <- unique(sea_schaefer400_7_withinnet$net1)
this_net <- networks[[2]]
this_net_dt <- sea_schaefer400_7_withinnet[net1 == this_net] 

# Three candidate models, each cached on disk so re-knitting is cheap:
#  - test_2l:     two-level (participant only), uncorrelated intercept/H slope
#  - test_3l:     three-level (session nested in participant), with H
#  - test_3l_noh: three-level, intercept-only (no hemisphere term)
model_dir <- 'models'
test_model_list <- list(test_2l = list(fn = file.path(model_dir, 'test_2l.RDS'),
                                         form = z ~ 1 + H + (1 + H || id)),
                          test_3l = list(fn = file.path(model_dir, 'test_3l.RDS'),
                                         form = z ~ 1 + H + (1 + H || id/sess)),
                          test_3l_noh = list(fn = file.path(model_dir, 'test_3l_noh.RDS'),
                                             form = z ~ 1 + (1 | id/sess)))
cl <- parallel::makePSOCKcluster(max(c(ncores - 1), 1))

# Fit each model on a worker, or reload a cached fit if one exists.
test_model_fits <- parallel::parLapply(cl = cl, test_model_list, function(model_list, d){
  library(lme4)
  if(!file.exists(model_list[['fn']])){
    f <- model_list[['form']]
    fit <- lmer(f, data = d)
    saveRDS(fit, model_list[['fn']])
  } else {
    fit <- readRDS(model_list[['fn']])
  }
  return(fit)
}, d = this_net_dt)

try(parallel::stopCluster(cl))
three_level <- test_model_fits$test_3l
three_level_noh <- test_model_fits$test_3l_noh

summary(three_level)
## Linear mixed model fit by REML ['lmerMod']
## Formula: z ~ 1 + H + ((1 | id/sess) + (0 + H | id/sess))
##    Data: d
## 
## REML criterion at convergence: 182855.8
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -4.8260 -0.6863 -0.0761  0.6112  6.0690 
## 
## Random effects:
##  Groups    Name        Variance  Std.Dev.
##  sess.id   H           5.427e-04 0.023296
##  sess.id.1 (Intercept) 3.114e-03 0.055802
##  id        H           1.334e-04 0.011551
##  id.1      (Intercept) 6.391e-05 0.007994
##  Residual              7.966e-02 0.282237
## Number of obs: 587200, groups:  sess:id, 289; id, 30
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept) 0.218603   0.003614  60.484
## H           0.002104   0.002546   0.826
## 
## Correlation of Fixed Effects:
##   (Intr)
## H -0.005
## convergence code: 0
## Model failed to converge with max|grad| = 0.00602013 (tol = 0.002, component 1)
summary(three_level_noh)
## Linear mixed model fit by REML ['lmerMod']
## Formula: z ~ 1 + (1 | id/sess)
##    Data: d
## 
## REML criterion at convergence: 186542
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -4.8941 -0.6869 -0.0774  0.6111  6.0356 
## 
## Random effects:
##  Groups   Name        Variance  Std.Dev.
##  sess:id  (Intercept) 3.193e-03 0.056508
##  id       (Intercept) 6.238e-06 0.002498
##  Residual             8.027e-02 0.283317
## Number of obs: 587200, groups:  sess:id, 289; id, 30
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept) 0.219509   0.003376   65.03
# Extract random-effect variance components from both parameterizations.
lll_varcorr <- VarCorr(three_level)
lll_varcorr_noh <- VarCorr(three_level_noh)

# The H model's intercept components live in the `.1` entries (id.1,
# sess.id.1) because `||` expands into separate uncorrelated terms; the
# no-H model keeps the plain id / sess:id names.
report_df <- data.frame(
  stat = c('id_var', 'sess_var', 'resid_var', 'total_var'),
  three_level = c(lll_varcorr$id.1['(Intercept)','(Intercept)'],
                  lll_varcorr$sess.id.1['(Intercept)','(Intercept)'],
                  sigma(three_level)^2,
                  sum(unlist(lapply(lll_varcorr, diag))) + sigma(three_level)^2),
  three_level_noh = c(lll_varcorr_noh$id['(Intercept)','(Intercept)'],
                      lll_varcorr_noh$`sess:id`['(Intercept)','(Intercept)'],
                      sigma(three_level_noh)^2,
                      sum(unlist(lapply(lll_varcorr_noh, diag))) + sigma(three_level_noh)^2))

# Percentages of total variance (row 4) for each model...
report_df$three_level_perc <- 
  sprintf('%.1f%%', report_df$three_level/report_df$three_level[[4]]*100)
report_df$three_level_noh_perc <- 
  sprintf('%.1f%%', report_df$three_level_noh/report_df$three_level_noh[[4]]*100)
# ...and the id/session split as a percentage of random-effects variance only.
report_df$three_level_RE_perc <- 
  c(sprintf('%.1f%%', report_df$three_level[1:2]/sum(report_df$three_level[1:2])*100), '-', '-')
report_df$three_level_RE_noh_perc <- 
  c(sprintf('%.1f%%', report_df$three_level_noh[1:2]/sum(report_df$three_level_noh[1:2])*100), '-', '-')
knitr::kable(report_df, digits = 5)
stat three_level three_level_noh three_level_perc three_level_noh_perc three_level_RE_perc three_level_RE_noh_perc
id_var 0.00006 0.00001 0.1% 0.0% 2.0% 0.2%
sess_var 0.00311 0.00319 3.7% 3.8% 98.0% 99.8%
resid_var 0.07966 0.08027 95.4% 96.2% - -
total_var 0.08351 0.08347 100.0% 100.0% - -

Fit for each network

networks <- unique(sea_schaefer400_7_withinnet$net1)

# One work item per network: the network's name plus its slice of the data.
netlist <- lapply(networks, function(net_name){
  list(network = net_name, d = sea_schaefer400_7_withinnet[net1 == net_name])
})

cl <- parallel::makePSOCKcluster(max(c(ncores - 1, 1)))
# Fit the three-level model for each network on a worker, reusing a cached
# fit from disk when one exists.
model_fits <- parallel::parLapply(cl = cl, netlist, function(work_item){
  library(lme4)
  net_name <- work_item[['network']]
  net_data <- work_item[['d']]
  cache_fn <- file.path('models', paste0('lmer_fit-3l-', net_name, '.RDS'))
  if(file.exists(cache_fn)){
    fit <- readRDS(cache_fn)
  } else {
    fit <- lmer(z ~ 1 + H + (1 + H || id/sess), data = net_data)
    saveRDS(fit, cache_fn)
  }
  fit
})
try(parallel::stopCluster(cl))
try(parallel::stopCluster(cl))

# For each fitted model, express the ID, ID/Session, remaining random-effect,
# and residual variances as percentages of (a) total variance and (b) total
# random-effects variance (residual is NA in the RE-only column).
proportion_variance_tables <- lapply(model_fits, function(fit){
  vc <- VarCorr(fit)
  var_id <- vc$id.1['(Intercept)', '(Intercept)']
  var_sess <- vc$sess.id.1['(Intercept)', '(Intercept)']
  var_resid <- sigma(fit)^2
  var_re_total <- sum(unlist(lapply(vc, diag)))
  # Everything in the RE total that is not an intercept variance (i.e. the
  # hemisphere slopes).
  var_other <- var_re_total - var_id - var_sess
  var_total <- var_re_total + var_resid
  data.frame(source = rep(c('ID', 'ID/Session', 'Other RE', 'Residual'), 2),
             out_of = factor(c(rep('total', 4), rep('RE', 4)),
                             levels = c('total', 'RE')),
             percent = c(c(var_id, var_sess, var_other, var_resid) * 100 / var_total,
                         c(var_id, var_sess, var_other, NA) * 100 / var_re_total))
})

The plots on the left, below, show model-expected functional connectivity pearson correlations for each participant, for each session, overlayed on the raw data (one point per parcel, per session, per participant). A line for each participant’s model-expected mean across all sessions is overlayed on this. The right plot shows proportions of variance accounted for by the random effect estimates as a proportion of both total variance, and total random effects (RE) variance. The total RE variance includes individual means (ID), individual means per session (ID/Session), and the difference in connectivity between right and left hemispheres (we treat this as a nuisance variable, essentially).

library(patchwork)

max_points <- unlist(sea_schaefer400_7_withinnet[, .N, by = net1][, list(max = max(N))])

# Stacked bar chart of variance-component percentages, one bar per
# denominator ('total' variance vs random-effects-only variance).
# `percent_table` is one element of proportion_variance_tables.
plot_percents <- function(percent_table){
  pct_breaks <- c(0, 20, 40, 60, 80, 100)
  p <- ggplot(percent_table, aes(y = percent, x = out_of, fill = source)) + 
    geom_col(position = 'stack') + 
    scale_fill_manual(breaks = c('ID', 'ID/Session', 'Other RE', 'Residual'),
                      values = apal[c(4,3,2,1)]) +
    scale_y_continuous(breaks = pct_breaks,
                       labels = paste0(pct_breaks, '%')) + 
    labs(x = 'Variance (denominator)', y = '', fill = 'Variance source') +
    jftheme
  p
}

# Model-expected FC per participant-session (points/lines) plotted over the
# raw parcel-pair correlations, with a right/top rug of per-participant means.
# `max_points` scales point transparency so denser networks plot lighter.
plot_predictions <- function(amodel, max_points){
  adt <- as.data.table(amodel@frame)
  # Session number as a factor for the x axis ('ses-03' -> 3).
  adt[, wave := factor(as.numeric(gsub('ses-(\\d+)', '\\1', sess)))]
  # One prediction row per participant-session, at H = 0 (hemisphere-neutral).
  newdata <- unique(adt, by = c('id', 'sess', 'wave'))[, c('H', 'z') := list(0, NULL)]
  newdata$z_prime <- predict(amodel, newdata = newdata)
  # Participant-level expectation only (session random effects dropped).
  newdata$z_prime_id <- predict(amodel, newdata = newdata, re.form = ~(1|id))
  point_alpha <- .125 - dim(adt)[[1]]/max_points/8 + .01
  ggplot(newdata, aes(x = wave, y = tanh(z_prime), group = id)) + 
    #geom_violin(data = adt, aes(group = NULL, y = tanh(z)), size = 0, alpha = .5, fill = apal[[1]]) +
    geom_point(data = adt, aes(group = NULL, y = tanh(z)), alpha = point_alpha, color = apal[[1]], position = position_jitter(), size = .5) +
    geom_point(color = apal[[3]], alpha = 1) + 
    geom_line(color = apal[[3]]) + 
    geom_rug(data = unique(newdata, by = 'id'), aes(y = z_prime_id, x = NULL), color = apal[[4]], alpha = 1, sides = 'tr', length = unit(0.03, "npc")) + 
    geom_hline(yintercept = 0, color = apal[[1]]) + 
    coord_cartesian(ylim = c(-.025, .65), xlim = c(1, 10)) + 
    labs(y = 'FC correlation', x = 'Session') + 
    jftheme
}

# One tabbed section per network: prediction + variance plots, then the
# variance-percentage table.
for(i in seq_along(model_fits)){
  model_fit <- model_fits[[i]]
  network_name <- networks[[i]]
  cat('\n\n### ', network_name, '{.tabset}\n\n') 
  cat('\n\n#### Plot\n\n')
  print(plot_predictions(model_fit, max_points = max_points) + plot_percents(proportion_variance_tables[[i]]) +
          plot_layout(ncol = 2, widths = c(4,1)))
  cat('\n\n#### Table (%)\n\n')
  # Index with [[i]], not [[1]]: the previous version printed the first
  # network's table for every network, which is why all the rendered tables
  # below were identical.
  print(knitr::kable(tidyr::spread(proportion_variance_tables[[i]], out_of, percent), digits = 1))

}

Cont

Plot

Table (%)

source total RE
ID 0.9 13.5
ID/Session 4.3 67.2
Other RE 1.2 19.3
Residual 93.7 NA

Default

Plot

Table (%)

source total RE
ID 0.9 13.5
ID/Session 4.3 67.2
Other RE 1.2 19.3
Residual 93.7 NA

DorsAttn

Plot

Table (%)

source total RE
ID 0.9 13.5
ID/Session 4.3 67.2
Other RE 1.2 19.3
Residual 93.7 NA

Limbic

Plot

Table (%)

source total RE
ID 0.9 13.5
ID/Session 4.3 67.2
Other RE 1.2 19.3
Residual 93.7 NA

SalVentAttn

Plot

Table (%)

source total RE
ID 0.9 13.5
ID/Session 4.3 67.2
Other RE 1.2 19.3
Residual 93.7 NA

SomMot

Plot

Table (%)

source total RE
ID 0.9 13.5
ID/Session 4.3 67.2
Other RE 1.2 19.3
Residual 93.7 NA

Vis

Plot

Table (%)

source total RE
ID 0.9 13.5
ID/Session 4.3 67.2
Other RE 1.2 19.3
Residual 93.7 NA